Load R packages and define colour functions

library(tidyverse) ; library(reshape2) ; library(glue) ; library(plotly) ; library(dendextend)
library(RColorBrewer) ; library(viridis) ; require(gridExtra) ; library(colorspace)
library(GGally) ; library(ggpubr) ; library(ggExtra)
library(WGCNA)
library(expss)
library(polycor)
library(biomaRt)
library(clusterProfiler) ; library(ReactomePA) ; library(DOSE) ; library(org.Hs.eg.db)
library(knitr) ; library(kableExtra)

SFARI_colour_hue = function(r) {
  pal = c('#FF7631','#FFB100','#E8E328','#8CC83F','#62CCA6','#59B9C9','#b3b3b3','#808080','gray','#d9d9d9')[r]
}

# Get colors from the ggplot palette
gg_colour_hue = function(n) {
  hues = seq(15, 375, length = n+1)
  pal = hcl(h = hues, l = 65, c = 100)[1:n]
}

# Assign an HCL rainbow colour to each module
get_mod_colours = function(mods){
  
  n = length(unique(mods))-1
  set.seed(123) ; rand_order = sample(1:n)
  mod_colors = c('white', gg_colour_hue(n)[rand_order])
  names(mod_colors) = mods %>% table %>% names
  
  return(mod_colors)
}

Load preprocessed dataset (code in 2.1.Preprocessing_pipeline.Rmd)

# SFARI Genes
SFARI_genes = read_csv('./../../SFARI/Data/SFARI_genes_01-03-2020_w_ensembl_IDs.csv')

# Load Gandal dataset
load('./../Data/preprocessedData/preprocessed_data.RData')
datExpr = datExpr %>% data.frame

# WGCNA metrics
WGCNA_metrics = read.csv('./../Data/preprocessedData/WGCNA_metrics.csv')

# Updates genes_info with SFARI information and clusters
genes_info = genes_info %>% left_join(SFARI_genes, by = 'ID') %>% 
             left_join(datGenes %>% mutate(ID = ensembl_gene_id) %>% dplyr::select(ID, hgnc_symbol), by = 'ID') %>%
             dplyr::select(ID, hgnc_symbol, log2FoldChange, shrunken_log2FoldChange, significant, Neuronal) %>%
             left_join(WGCNA_metrics, by = 'ID') %>% dplyr::select(-contains('pval'))


rm(dds, WGCNA_metrics)




3.3.1 Top clusters by Cluster-diagnosis correlation


Selecting the top clusters

Modules with a cluster-diagnosis correlation magnitude larger than 0.8

plot_data = genes_info %>% dplyr::select(Module, module_number, MTcor) %>% distinct %>% 
            mutate(alpha = ifelse(abs(MTcor)>0.8, 1, 0.3))

top_modules = plot_data %>% arrange(desc(MTcor)) %>% filter(abs(MTcor) > 0.8) %>% pull(Module) %>% as.character

The 2 modules that fulfill this condition are clusters 37, 49

ggplotly(plot_data %>% mutate(module_number=ifelse(module_number == max(module_number), 'No cluster', 
                                                   paste('Cluster', module_number))) %>%
         ggplot(aes(reorder(module_number, -MTcor), MTcor)) + 
         geom_bar(stat='identity', fill = plot_data$Module, alpha = plot_data$alpha) + 
         geom_hline(yintercept =c(0.8, -0.8), color = 'gray', linetype = 'dotted') + 
         xlab('Clusters')+ ylab('Cluster-diagnosis correlation') + theme_minimal() + 
         theme(axis.text.x = element_text(angle = 90, hjust = 1)))

The modules consist mainly of points with high values in PC2 (which we know is related to lfc), so this result is consistent with the high correlation between Module and Diagnosis, although some of the points with the highest PC2 values do not belong to these top modules

The genes belonging to the modules with the positive Module-Diagnosis correlation have negative LFC values.

pca = datExpr %>% prcomp

plot_data = data.frame('ID'=rownames(datExpr), 'PC1' = pca$x[,1], 'PC2' = pca$x[,2]) %>%
            left_join(genes_info %>% dplyr::select(ID,Module,module_number,gene.score,hgnc_symbol), by='ID') %>%
            mutate(ImportantModules = ifelse(Module %in% top_modules, as.character(Module), 'Others')) %>%
            mutate(ImportantModules_number = ifelse(ImportantModules == 'Others', 'Others',
                   paste('Cluster', genes_info$module_number[genes_info$Module==ImportantModules]))) %>%
            mutate(color = ifelse(ImportantModules=='Others', 'gray', ImportantModules),
                   alpha = ifelse(ImportantModules=='Others', 0.2, 0.4),
                   gene_id = paste0(ID, ' (', hgnc_symbol, ')')) %>%
            apply_labels(ImportantModules_number = 'Top Clusters')

cro(plot_data$ImportantModules_number)
 #Total 
 Top Clusters 
   Cluster 37  99
   Cluster 49  83
   Others  15344
   #Total cases  15526
ggplotly(plot_data %>% ggplot(aes(PC1, PC2, color=ImportantModules)) + 
         geom_point(alpha=plot_data$alpha, color=plot_data$color, aes(ID=gene_id)) + theme_minimal() +
         xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],2),'%)')) +
         ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],2),'%)')) +
         ggtitle('Genes belonging to the clusters with the strongest relation to ASD'))
rm(pca)


Module Eigengenes


Since these modules have the strongest relation to autism, this pattern should be reflected in their model eigengenes, having two different behaviours for the samples corresponding to autism and the ones corresponding to control

In both cases, the Eigengenes separate the behaviour between autism and control samples very clearly (p-value < \(10^{-4}\)). Modules with positive Module-Diagnosis correlation have higher eigengenes in the ASD samples

plot_EGs = function(module){

  plot_data = data.frame('ID' = rownames(MEs), 'MEs' = MEs[,paste0('ME', module)], 
                         'Diagnosis' = datMeta$Diagnosis)

  p = plot_data %>% ggplot(aes(Diagnosis, MEs, fill=Diagnosis)) + 
      geom_boxplot(outlier.colour='#cccccc', outlier.shape='o', outlier.size=3) +
      ggtitle(paste0('Cluster ', genes_info$module_number[genes_info$Module==module][1], 
                     '  (MTcor=',round(genes_info$MTcor[genes_info$Module == module][1],2),')')) + 
      stat_compare_means(method = 't.test', method.args = list(var.equal = FALSE), label = 'p.signif',
                        ref.group = 'ASD') +
      ylab('Cluster Eigengenes') + theme_minimal() + theme(legend.position='none')#, plot.margin = margin(0,0,0,2,'cm'))
  
  return(p)
}


# Calculate Module Eigengenes
ME_object = datExpr %>% t %>% moduleEigengenes(colors = genes_info$Module)
MEs = orderMEs(ME_object$eigengenes)

p1 = plot_EGs(top_modules[1])
p2 = plot_EGs(top_modules[2])

grid.arrange(p1, p2, nrow=1)

rm(plot_EGs, ME_object, MEs, p1, p2)


Identifying representative genes for each Module


In the WGCNA pipeline, the most representative genes of each module are selected based on having a high module membership as well as a high (absolute) gene significance, so I’m going to do the same

SFARI Genes don’t seem to be more representative than the rest of the genes

create_plot = function(module){
  
  plot_data = genes_info %>% dplyr::select(ID, paste0('MM.',gsub('#', '', module)), GS, gene.score) %>% 
              filter(genes_info$Module==module)
  colnames(plot_data)[2] = 'MM'
  
  SFARI_colors = as.numeric(names(table(as.character(plot_data$gene.score)[plot_data$gene.score!='Others'])))
  
  p = ggplotly(plot_data %>% mutate(gene.score = ifelse(gene.score =='Others', 'Not in SFARI', 
                                                        as.character(gene.score))) %>% 
               ggplot(aes(MM, GS, color=gene.score)) +
               geom_point(alpha=0.5, aes(ID=ID)) +  xlab('Cluster Membership') + ylab('Gene Significance') +
               ggtitle(paste0('Cluster ', genes_info$module_number[genes_info$Module==module][1], '  (MTcor = ', 
                              round(genes_info$MTcor[genes_info$Module == module][1],2),')')) +
               scale_color_manual(values=SFARI_colour_hue(r=c(SFARI_colors,7))) + theme_minimal())
  
  # For thesis
  # p = plot_data %>% mutate(gene.score = ifelse(gene.score =='Others', 'Not in SFARI', 
  #                                              ifelse(gene.score=='Neuronal', as.character(gene.score), 
  #                                                     paste('Score',as.character(gene.score))))) %>% 
  #     mutate(gene.score = factor(gene.score, levels = c('Score 1', 'Score 2', 'Score 3', 'Neuronal', 
  #                                                       'Not in SFARI')),
  #            alpha = ifelse(grepl('Score', gene.score), 1, 0)) %>%
  #     ggplot(aes(MM, GS, color=gene.score, shape = gene.score)) + geom_point(aes(alpha = alpha)) +  
  #     xlab('Cluster Membership') + ylab('Gene Significance') + scale_alpha_binned(range = c(0.5, 0.9)) +
  #     scale_color_manual(values=SFARI_colour_hue(r=c(SFARI_colors,7))) + theme_minimal() + 
  #     labs(color = 'SFARI Score', shape = 'SFARI Score') + guides(alpha = FALSE)
  # if(module != top_modules[length(top_modules)]) {p = p + theme(legend.position = 'none')}
  
  return(p)
}

create_plot(top_modules[1])
create_plot(top_modules[2])
#create_plot(top_modules[3])

rm(create_plot)

Top 10 genes for each module


Ordered by \(\frac{MM+|GS|}{2}\)

There aren’t that many SFARI genes in the top genes of the modules

create_table = function(module){
  top_genes = genes_info %>% dplyr::select(ID, hgnc_symbol, paste0('MM.',gsub('#','',module)), GS, gene.score) %>% 
              filter(genes_info$Module==module) %>% dplyr::rename('MM' = paste0('MM.',gsub('#','',module))) %>% 
              mutate(Relevance = (MM+abs(GS))/2, 
                     gene.score = ifelse(gene.score == 'Others', 'Not in SFARI', as.character(gene.score))) %>% 
              arrange(by=-Relevance) %>% top_n(10) %>% 
              dplyr::rename('Gene Symbol' = hgnc_symbol, 'SFARI Score' = gene.score)
  return(top_genes)
}

top_genes = list()
for(i in 1:length(top_modules)) top_genes[[i]] = create_table(top_modules[i])

kable(top_genes[[1]] %>% dplyr::select(-ID), 
      caption=paste0('Top 10 genes for Module ', genes_info$module_number[genes_info$Module==top_modules[1]][1], 
      '  (MTcor = ', round(genes_info$MTcor[genes_info$Module == top_modules[1]][1],2),')')) %>% 
      kable_styling(full_width = F)
Top 10 genes for Module 37 (MTcor = 0.81)
Gene Symbol MM GS SFARI Score Relevance
IMMP2L 0.8300904 0.8030811 3 0.8165858
CMC1 0.7564467 0.8304827 Not in SFARI 0.7934647
PHC3 0.8894820 0.6764330 Not in SFARI 0.7829575
SLC35F5 0.8228983 0.7405212 Not in SFARI 0.7817098
POLK 0.6737194 0.8537244 Not in SFARI 0.7637219
TRDMT1 0.8494942 0.6750133 Not in SFARI 0.7622537
ANKRD10 0.7833334 0.7394511 Not in SFARI 0.7613922
POU5F2 0.8031093 0.7005633 Not in SFARI 0.7518363
FSIP2 0.7562759 0.7281805 Not in SFARI 0.7422282
NBEAL1 0.8074616 0.6645718 Not in SFARI 0.7360167
kable(top_genes[[2]] %>% dplyr::select(-ID), 
      caption=paste0('Top 10 genes for Module ', genes_info$module_number[genes_info$Module==top_modules[2]][1], 
      '  (MTcor = ', round(genes_info$MTcor[genes_info$Module == top_modules[2]][1],2),')')) %>% 
      kable_styling(full_width = F)
Top 10 genes for Module 49 (MTcor = 0.8)
Gene Symbol MM GS SFARI Score Relevance
ALG10 0.8583644 0.8089918 Not in SFARI 0.8336781
ZBTB41 0.8656532 0.7160610 Not in SFARI 0.7908571
ZDHHC21 0.7684366 0.7716765 Not in SFARI 0.7700566
NHLRC2 0.7554051 0.7502343 Not in SFARI 0.7528197
GJA9 0.5770159 0.8635066 Not in SFARI 0.7202612
ACAD11 0.5794607 0.8595972 Not in SFARI 0.7195289
ZNF483 0.8094988 0.6267750 Not in SFARI 0.7181369
DCP2 0.7708707 0.6568334 Not in SFARI 0.7138521
NUP43 0.7066426 0.7173994 Not in SFARI 0.7120210
SMAD5 0.7384460 0.6709100 Not in SFARI 0.7046780
rm(create_table, i)

Most of the top genes, idenpendently of the cluster, have very high values for the 1st principal component, but not for the 2nd one, as the other datasets

pca = datExpr %>% prcomp

ids = c()
for(tg in top_genes) ids = c(ids, tg$ID)

plot_data = data.frame('ID'=rownames(datExpr), 'gene_name' = genes_info$hgnc_symbol, 
                       'PC1' = pca$x[,1], 'PC2' = pca$x[,2]) %>%
            left_join(genes_info, by='ID') %>% dplyr::select(ID, PC1, PC2,gene_name, Module, gene.score) %>%
            mutate(color = ifelse(Module %in% top_modules, as.character(Module), '#cccccc')) %>%
            mutate(alpha = ifelse(color %in% top_modules & ID %in% ids, 1,
                                  ifelse(color %in% top_modules, 0.25, 0.1)))

plot_data %>% ggplot(aes(PC1, PC2)) + geom_point(alpha=plot_data$alpha, color=plot_data$color) + 
              geom_text(aes(label=ifelse(ID %in% ids,as.character(gene_name),'')),
                        color = plot_data$color, size = 2.5, hjust = 0, vjust = 0) +
              xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],2),'%)')) +
              ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],2),'%)')) +
              theme_minimal() + ggtitle('Most relevant genes for top clusters')

rm(pca, tg, plot_data)


0/30 genes are differentially expressed

1/30 genes are SFARI Genes (IMMP2L)

Level of expression by Diagnosis for top genes, ordered by relevance (as defined above): There is a visible difference in level of expression between diagnosis groups in all of these genes

create_plot = function(i){
  
  plot_data = datExpr[rownames(datExpr) %in% top_genes[[i]]$ID,] %>% mutate('ID' = rownames(.)) %>% 
              melt(id.vars='ID') %>% mutate(variable = gsub('X','',variable)) %>%
              left_join(top_genes[[i]], by='ID') %>%
              left_join(datMeta %>% dplyr::select(Subject_ID, Diagnosis),
                        by = c('variable'='Subject_ID')) %>% arrange(desc(Relevance))
  
  p = ggplotly(plot_data %>% mutate(external_gene_id=factor(`Gene Symbol`, 
                                    levels=unique(plot_data$`Gene Symbol`), ordered=T)) %>%
               ggplot(aes(external_gene_id, value, fill=Diagnosis)) + geom_boxplot() + theme_minimal() +
                      ggtitle(paste0('Top genes for cluster ', 
                                     genes_info$module_number[genes_info$Module==top_modules[i]][1], 
                                     ' (MTcor = ',
                      round(genes_info$MTcor[genes_info$Module == top_modules[i]][1],2), ')')) + 
                      xlab('') + ylab('Level of Expression') +
                      theme(axis.text.x = element_text(angle = 90, hjust = 1)))
  
  # For thesis
  # p = plot_data %>% mutate(external_gene_id=factor(`Gene Symbol`, 
  #                                   levels=rev(unique(plot_data$`Gene Symbol`)), ordered=T)) %>%
  #     ggplot(aes(external_gene_id, value, fill=Diagnosis)) + 
  #     geom_boxplot(outlier.colour='gray', outlier.shape='o', outlier.size=3) +
  #     xlab('') + ylab('Level of expression') + theme_minimal() + coord_flip()
  # if(i < length(top_modules)) {p = p + theme(legend.position = 'none')}
  
  return(p)
  
}

create_plot(1)
create_plot(2)
#create_plot(3)


rm(create_plot)





3.3.2 Top clusters by enrichment in SFARI Genes


Selecting the top clusters

Using ORA, as it was done in the previous section and selecting as top clusters the ones with an adjusted p-value lower than \(10^{-3}\) (enrichment higher than 0.999)

# Calculate % and ORA of SFARI Genes in each module

#modules = unique(genes_info$Module[genes_info$Module!='gray']) %>% as.character
modules = unique(genes_info$Module) %>% as.character

# We need the entrez ID of the genes for this
getinfo = c('ensembl_gene_id','entrezgene')
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL', dataset='hsapiens_gene_ensembl', 
               host='feb2014.archive.ensembl.org')
biomart_output = getBM(attributes=getinfo, filters=c('ensembl_gene_id'), 
                       values=genes_info$ID, mart=mart) %>%
                 left_join(genes_info %>% dplyr::select(ID, Module,gene.score), by = c('ensembl_gene_id'='ID'))

# We need to build a term2gene dataframe with the genes and their SFARI Scores
term2gene = biomart_output %>% mutate(term = ifelse(gene.score == 'Others', 'Others', 'SFARI'), 
                                      'gene' = entrezgene) %>%  dplyr::select(term, gene) %>% distinct


enrichment_data = data.frame('Module' = modules, 'size' = 0, 'perc_SFARI' = 0,
                             'pval_ORA' = 0, 'padj_ORA' = 0)

for(i in 1:length(modules)){
  module = modules[i]
  genes_in_module = biomart_output$entrezgene[biomart_output$Module==module]
  ORA_module = enricher(gene = genes_in_module, universe = biomart_output$entrezgene %>% as.character, 
                        pAdjustMethod = 'bonferroni', TERM2GENE = term2gene, 
                        pvalueCutoff = 1, qvalueCutoff = 1, maxGSSize = 50000) %>% 
                        data.frame %>% dplyr::select(-geneID,-Description)
  ORA_pval = ifelse('SFARI' %in% ORA_module$ID, ORA_module$pvalue[ORA_module$ID=='SFARI'], 1)
  ORA_padj = ifelse('SFARI' %in% ORA_module$ID, ORA_module$p.adjust[ORA_module$ID=='SFARI'], 1)
  enrichment_data[i,-1] = c(length(genes_in_module), 
                            mean(genes_info$gene.score[genes_info$Module==module]!='Others'), 
                            ORA_pval, ORA_padj)
}

enrichment_data = enrichment_data %>% 
                  left_join(genes_info %>% dplyr::select(Module) %>% unique, by = 'Module')

genes_info = genes_info %>% left_join(enrichment_data, by = 'Module') 

plot_data = genes_info %>% dplyr::select(Module, module_number, MTcor, size, padj_ORA) %>% distinct %>% 
            mutate(alpha = ifelse(abs(1-padj_ORA)>0.999, 1, 0.3))

top_modules = plot_data %>% arrange(padj_ORA) %>% filter((1-padj_ORA) > 0.999) %>% pull(Module) %>% as.character

rm(i, module, genes_in_module, ORA_module, ORA_pval, ORA_padj, getinfo, mart, term2gene)

The 4 modules that fulfill this condition are clusters 28, 35, 57, 70

ggplotly(plot_data %>% ggplot(aes(MTcor, padj_ORA, size=size)) + 
         geom_point(color=plot_data$Module, alpha=0.5, aes(id=module_number)) +
         geom_hline(yintercept = 0.001, color = 'gray', linetype = 'dotted') + 
         xlab('Cluster-diagnosis Correlation') + ylab('Corrected p-values') + scale_y_log10() +
         theme_minimal() + theme(legend.position = 'none') +
         ggtitle('Clusters Significantly Enriched in SFARI Genes'))
pca = datExpr %>% prcomp

plot_data = data.frame('ID'=rownames(datExpr), 'PC1' = pca$x[,1], 'PC2' = pca$x[,2]) %>%
            left_join(genes_info %>% dplyr::select(ID,Module,module_number,gene.score,hgnc_symbol), by='ID') %>%
            mutate(ImportantModules = ifelse(Module %in% top_modules, as.character(Module), 'Others')) %>%
            mutate(ImportantModules_number = ifelse(ImportantModules == 'Others', 'Others',
                   paste('Cluster', genes_info$module_number[genes_info$Module==ImportantModules]))) %>%
            mutate(color = ifelse(ImportantModules=='Others', 'gray', ImportantModules),
                   alpha = ifelse(ImportantModules=='Others', 0.2, 0.4),
                   gene_id = paste0(ID, ' (', hgnc_symbol, ')')) %>%
            apply_labels(ImportantModules_number = 'Top Clusters')

cro(plot_data$ImportantModules_number)
 #Total 
 Top Clusters 
   Cluster 28  49
   Cluster 35  1421
   Cluster 57  80
   Cluster 70  271
   Others  13705
   #Total cases  15526
ggplotly(plot_data %>% ggplot(aes(PC1, PC2, color=ImportantModules)) + 
         geom_point(alpha=plot_data$alpha, color=plot_data$color, aes(ID=gene_id)) + theme_minimal() +
         xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],2),'%)')) +
         ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],2),'%)')) +
         ggtitle('Genes belonging to the clusters with the strongest relation to ASD'))
rm(pca)


Module Eigengenes


The Eigengenes no longer separate well the behaviour between ASD and control.

plot_EGs = function(module){

  plot_data = data.frame('ID' = rownames(MEs), 'MEs' = MEs[,paste0('ME', module)], 
                         'Diagnosis' = datMeta$Diagnosis)

  p = plot_data %>% ggplot(aes(Diagnosis, MEs, fill=Diagnosis)) + 
      geom_boxplot(outlier.colour='#cccccc', outlier.shape='o', outlier.size=3) +
      ggtitle(paste0('Cluster ', genes_info$module_number[genes_info$Module==module][1], 
                     '  (MTcor=',round(genes_info$MTcor[genes_info$Module == module][1],2),')')) + 
      stat_compare_means(method = 't.test', method.args = list(var.equal = FALSE), label = 'p.signif',
                        ref.group = 'ASD') +
      ylab('Cluster Eigengenes') + theme_minimal() + theme(legend.position='none')#, plot.margin = margin(0,0,0,2,'cm'))
  
  return(p)
}


# Calculate Module Eigengenes
ME_object = datExpr %>% t %>% moduleEigengenes(colors = genes_info$Module)
MEs = orderMEs(ME_object$eigengenes)

p1 = plot_EGs(top_modules[1])
p2 = plot_EGs(top_modules[2])
p3 = plot_EGs(top_modules[3])
p4 = plot_EGs(top_modules[4])

grid.arrange(p1, p2, p3, p4, nrow=1)

rm(plot_EGs, ME_object, MEs, p1, p2, p3, p4)



Identifying representative genes for each Module


In the WGCNA pipeline, the most representative genes of each module are selected based on having a high module membership as well as a high (absolute) gene significance, so I’m going to do the same

SFARI Genes don’t seem to be more representative than the rest of the genes

create_plot = function(module){
  
  plot_data = genes_info %>% dplyr::select(ID, paste0('MM.',gsub('#', '', module)), GS, gene.score) %>% 
              filter(genes_info$Module==module)
  colnames(plot_data)[2] = 'MM'
  
  SFARI_colors = as.numeric(names(table(as.character(plot_data$gene.score)[plot_data$gene.score!='Others'])))
  
  p = ggplotly(plot_data %>% mutate(gene.score = ifelse(gene.score =='Others', 'Not in SFARI', 
                                                        as.character(gene.score))) %>% 
               ggplot(aes(MM, GS, color=gene.score)) +
               geom_point(alpha=0.5, aes(ID=ID)) +  xlab('Cluster Membership') + ylab('Gene Significance') +
               ggtitle(paste0('Cluster ', genes_info$module_number[genes_info$Module==module][1], '  (MTcor = ', 
                              round(genes_info$MTcor[genes_info$Module == module][1],2),')')) +
               scale_color_manual(values=SFARI_colour_hue(r=c(SFARI_colors,7))) + theme_minimal())
  
  # For thesis
  # p = plot_data %>% mutate(gene.score = ifelse(gene.score =='Others', 'Not in SFARI',
  #                                              ifelse(gene.score=='Neuronal', as.character(gene.score),
  #                                                     paste('Score',as.character(gene.score))))) %>%
  #     mutate(gene.score = factor(gene.score, levels = c('Score 1', 'Score 2', 'Score 3', 'Neuronal',
  #                                                       'Not in SFARI')),
  #            alpha = ifelse(grepl('Score', gene.score), 1, 0)) %>%
  #     ggplot(aes(MM, GS, color=gene.score, shape = gene.score)) + geom_point(aes(alpha = alpha)) +
  #     xlab('Cluster Membership') + ylab('Gene Significance') + scale_alpha_binned(range = c(0.5, 0.9)) +
  #     scale_color_manual(values=SFARI_colour_hue(r=c(SFARI_colors,7))) + theme_minimal() +
  #     labs(color = 'SFARI Score', shape = 'SFARI Score') + guides(alpha = FALSE)
  # if(module != top_modules[length(top_modules)]) {p = p + theme(legend.position = 'none')}
  
  return(p)
}

create_plot(top_modules[1])
create_plot(top_modules[2])
create_plot(top_modules[3])
create_plot(top_modules[4])
rm(create_plot)

Top 10 genes for each module


Ordered by \(\frac{MM+|GS|}{2}\)

There aren’t that many SFARI genes in the top genes of the modules

create_table = function(module){
  top_genes = genes_info %>% dplyr::select(ID, hgnc_symbol, paste0('MM.',gsub('#','',module)), GS, gene.score) %>% 
              filter(genes_info$Module==module) %>% dplyr::rename('MM' = paste0('MM.', gsub('#','',module))) %>% 
              mutate(Relevance = (MM+abs(GS))/2, 
                     gene.score = ifelse(gene.score == 'Others', 'Not in SFARI', as.character(gene.score))) %>% 
              arrange(by=-Relevance) %>% top_n(10) %>% 
              dplyr::rename('Gene Symbol' = hgnc_symbol, 'SFARI Score' = gene.score)
  return(top_genes)
}

top_genes = list()
for(i in 1:length(top_modules)) top_genes[[i]] = create_table(top_modules[i])

kable(top_genes[[1]] %>% dplyr::select(-ID), 
      caption=paste0('Top 10 genes for Module ', genes_info$module_number[genes_info$Module==top_modules[1]][1], 
      '  (MTcor = ', round(genes_info$MTcor[genes_info$Module == top_modules[1]][1],2),')')) %>% 
      kable_styling(full_width = F)
Top 10 genes for Module 57 (MTcor = 0.02)
Gene Symbol MM GS SFARI Score Relevance
SETBP1 0.8747576 0.2452886 1 0.5600231
CSMD1 0.8570502 -0.1996188 3 0.5283345
TNRC6B 0.7615816 -0.2704364 2 0.5160090
CACNA1D 0.8335130 -0.1983921 2 0.5159525
UNC79 0.7170968 0.2947462 2 0.5059215
UBAP1L 0.7252948 -0.2864899 Not in SFARI 0.5058924
C12orf45 0.6403891 0.3534496 Not in SFARI 0.4969194
AAK1 0.7182544 0.2506274 Not in SFARI 0.4844409
LRFN2 0.7855393 -0.1619271 3 0.4737332
CYP2A7 0.6361294 -0.2824547 Not in SFARI 0.4592921
kable(top_genes[[2]] %>% dplyr::select(-ID), 
      caption=paste0('Top 10 genes for Module ', genes_info$module_number[genes_info$Module==top_modules[2]][1], 
      '  (MTcor = ', round(genes_info$MTcor[genes_info$Module == top_modules[2]][1],2),')')) %>% 
      kable_styling(full_width = F)
Top 10 genes for Module 35 (MTcor = -0.29)
Gene Symbol MM GS SFARI Score Relevance
INF2 0.7844770 -0.6004869 Not in SFARI 0.6924819
NOVA2 0.8877885 -0.4821928 Not in SFARI 0.6849907
TTLL11 0.7057850 -0.6505207 Not in SFARI 0.6781529
TUBB4A 0.6468887 -0.7086364 Neuronal 0.6777626
FN3K 0.8506776 -0.4897078 Not in SFARI 0.6701927
WDR20 0.5488395 -0.7775068 Not in SFARI 0.6631732
OTUD7A 0.8155851 -0.5054212 2 0.6605031
NLGN3 0.8393498 -0.4683870 1 0.6538684
NRXN3 0.8181433 -0.4822748 1 0.6502091
POM121C 0.8225026 -0.4703754 Not in SFARI 0.6464390
kable(top_genes[[3]] %>% dplyr::select(-ID), 
      caption=paste0('Top 10 genes for Module ', genes_info$module_number[genes_info$Module==top_modules[3]][1], 
      '  (MTcor = ', round(genes_info$MTcor[genes_info$Module == top_modules[3]][1],2),')')) %>% 
      kable_styling(full_width = F)
Top 10 genes for Module 28 (MTcor = -0.34)
Gene Symbol MM GS SFARI Score Relevance
SLC45A1 0.8170232 -0.5497707 Not in SFARI 0.6833969
DAGLA 0.7763599 -0.5817339 3 0.6790469
KLC2 0.6871678 -0.6066124 Neuronal 0.6468901
SLC12A5 0.7575938 -0.4280150 2 0.5928044
SPATS2 0.5886467 -0.5517679 Not in SFARI 0.5702073
CHRNB2 0.7064857 -0.4101582 Neuronal 0.5583220
DLG4 0.7977514 -0.2997124 1 0.5487319
CLSTN1 0.8730686 -0.2223519 Not in SFARI 0.5477102
KCNA1 0.6862300 -0.4054832 Neuronal 0.5458566
SUSD4 0.6624899 -0.3726739 Not in SFARI 0.5175819
kable(top_genes[[4]] %>% dplyr::select(-ID), 
      caption=paste0('Top 10 genes for Module ', genes_info$module_number[genes_info$Module==top_modules[3]][1], 
      '  (MTcor = ', round(genes_info$MTcor[genes_info$Module == top_modules[4]][1],2),')')) %>% 
      kable_styling(full_width = F)
Top 10 genes for Module 28 (MTcor = 0.38)
Gene Symbol MM GS SFARI Score Relevance
LRP1B 0.8494493 0.5048267 Not in SFARI 0.6771380
C8orf34 0.6482112 0.6790534 Not in SFARI 0.6636323
TENM1 0.8035260 0.4925949 Neuronal 0.6480604
GABRA5 0.7812392 0.4996582 Neuronal 0.6404487
SLC26A11 0.6600965 0.5969309 Not in SFARI 0.6285137
CXXC4 0.7241745 0.5296116 Not in SFARI 0.6268931
PEG10 0.5546908 0.6955403 Not in SFARI 0.6251155
GNG2 0.7312157 0.5062701 Not in SFARI 0.6187429
GNB4 0.6930771 0.5407956 Not in SFARI 0.6169363
RALGPS2 0.6892086 0.5354183 Not in SFARI 0.6123135
rm(create_table, i)

Top genes don’t have very high PC2 values

pca = datExpr %>% prcomp

ids = c()
for(tg in top_genes) ids = c(ids, tg$ID)

plot_data = data.frame('ID'=rownames(datExpr), 'gene_name' = genes_info$hgnc_symbol, 
                       'PC1' = pca$x[,1], 'PC2' = pca$x[,2]) %>%
            left_join(genes_info, by='ID') %>% dplyr::select(ID, PC1, PC2,gene_name, Module, gene.score) %>%
            mutate(color = ifelse(Module %in% top_modules, as.character(Module), '#cccccc')) %>%
            mutate(alpha = ifelse(color %in% top_modules & ID %in% ids, 1,
                                  ifelse(color %in% top_modules, 0.25, 0.1)))

plot_data %>% ggplot(aes(PC1, PC2)) + geom_point(alpha=plot_data$alpha, color=plot_data$color) + 
              geom_text(aes(label=ifelse(ID %in% ids,as.character(gene_name),'')),
                        color = plot_data$color, size = 2.5, hjust = 0, vjust = 0) +
              xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],2),'%)')) +
              ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],2),'%)')) +
              theme_minimal() + ggtitle('Most relevant genes for top clusters')

rm(pca, tg, plot_data)


0/30 genes are differentially expressed

12/30 genes are SFARI Genes (OTUD7A, DLG4, TNRC6B, SETBP1, NLGN3, SLC12A5, LRFN2, UNC79, CSMD1, DAGLA, CACNA1D, NRXN3)

Level of expression by Diagnosis for top genes, ordered by relevance (as defined above): There is a visible difference in level of expression between diagnosis groups in all of these genes, but it’s not as strong as the differences in the top cluster by cluster-diagnosis correlation

create_plot = function(i){
  
  plot_data = datExpr[rownames(datExpr) %in% top_genes[[i]]$ID,] %>% mutate('ID' = rownames(.)) %>% 
              melt(id.vars='ID') %>% mutate(variable = gsub('X','',variable)) %>%
              left_join(top_genes[[i]], by='ID') %>%
              left_join(datMeta %>% dplyr::select(Subject_ID, Diagnosis),
                        by = c('variable'='Subject_ID')) %>% arrange(desc(Relevance))
  
  p = ggplotly(plot_data %>% mutate(external_gene_id=factor(`Gene Symbol`, 
                                    levels=unique(plot_data$`Gene Symbol`), ordered=T)) %>%
               ggplot(aes(external_gene_id, value, fill=Diagnosis)) + geom_boxplot() + theme_minimal() +
                      ggtitle(paste0('Top genes for cluster ', 
                                     genes_info$module_number[genes_info$Module==top_modules[i]][1], 
                                     ' (MTcor = ',
                      round(genes_info$MTcor[genes_info$Module == top_modules[i]][1],2), ')')) + 
                      xlab('') + ylab('Level of Expression') +
                      theme(axis.text.x = element_text(angle = 90, hjust = 1)))
  
  # # For the thesis
  # p = plot_data %>% mutate(external_gene_id=factor(`Gene Symbol`,
  #                                   levels=rev(unique(plot_data$`Gene Symbol`)), ordered=T)) %>%
  #     ggplot(aes(external_gene_id, value, fill=Diagnosis)) +
  #     geom_boxplot(outlier.colour='gray', outlier.shape='o', outlier.size=3) +
  #     xlab('') + ylab('Level of expression') + theme_minimal() + coord_flip()
  # if(i < length(top_modules)) {p = p + theme(legend.position = 'none')}
  
  return(p)
  
}

create_plot(1)
create_plot(2)
create_plot(3)
create_plot(4)
rm(create_plot)




Session info

sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.6 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
## 
## locale:
##  [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_GB.UTF-8    
##  [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
##  [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] kableExtra_1.1.0       knitr_1.32             org.Hs.eg.db_3.8.2    
##  [4] AnnotationDbi_1.46.1   IRanges_2.18.3         S4Vectors_0.22.1      
##  [7] Biobase_2.44.0         BiocGenerics_0.30.0    DOSE_3.10.2           
## [10] ReactomePA_1.28.0      clusterProfiler_3.12.0 biomaRt_2.40.5        
## [13] polycor_0.7-10         expss_0.10.7           WGCNA_1.69            
## [16] fastcluster_1.2.3      dynamicTreeCut_1.63-1  ggExtra_0.9           
## [19] ggpubr_0.2.5           magrittr_2.0.1         GGally_1.5.0          
## [22] colorspace_2.0-2       gridExtra_2.3          viridis_0.6.1         
## [25] viridisLite_0.4.0      RColorBrewer_1.1-2     dendextend_1.15.1     
## [28] plotly_4.9.2           glue_1.4.2             reshape2_1.4.4        
## [31] forcats_0.5.0          stringr_1.4.0          dplyr_1.0.1           
## [34] purrr_0.3.4            readr_1.3.1            tidyr_1.1.0           
## [37] tibble_3.1.2           ggplot2_3.3.5          tidyverse_1.3.0       
## 
## loaded via a namespace (and not attached):
##   [1] utf8_1.2.2            tidyselect_1.1.1      RSQLite_2.2.0        
##   [4] htmlwidgets_1.5.3     grid_3.6.3            BiocParallel_1.18.1  
##   [7] munsell_0.5.0         codetools_0.2-16      preprocessCore_1.46.0
##  [10] miniUI_0.1.1.1        withr_2.4.2           GOSemSim_2.10.0      
##  [13] highr_0.9             rstudioapi_0.13       ggsignif_0.6.2       
##  [16] labeling_0.4.2        urltools_1.7.3        polyclip_1.10-0      
##  [19] bit64_4.0.5           farver_2.1.0          vctrs_0.3.8          
##  [22] generics_0.1.0        xfun_0.25             R6_2.5.1             
##  [25] doParallel_1.0.15     graphlayouts_0.7.0    bitops_1.0-7         
##  [28] cachem_1.0.6          reshape_0.8.8         fgsea_1.10.1         
##  [31] gridGraphics_0.5-1    assertthat_0.2.1      promises_1.2.0.1     
##  [34] scales_1.1.1          ggraph_2.0.3          nnet_7.3-14          
##  [37] enrichplot_1.4.0      gtable_0.3.0          tidygraph_1.2.0      
##  [40] rlang_0.4.11          splines_3.6.3         lazyeval_0.2.2       
##  [43] acepack_1.4.1         impute_1.58.0         broom_0.7.0          
##  [46] europepmc_0.4         checkmate_2.0.0       BiocManager_1.30.16  
##  [49] yaml_2.2.1            modelr_0.1.6          crosstalk_1.1.1      
##  [52] backports_1.2.1       httpuv_1.6.1          qvalue_2.16.0        
##  [55] Hmisc_4.4-0           tools_3.6.3           ggplotify_0.1.0      
##  [58] ellipsis_0.3.2        jquerylib_0.1.4       ggridges_0.5.3       
##  [61] Rcpp_1.0.7            plyr_1.8.6            base64enc_0.1-3      
##  [64] progress_1.2.2        RCurl_1.98-1.4        prettyunits_1.1.1    
##  [67] rpart_4.1-15          cowplot_1.1.1         haven_2.2.0          
##  [70] ggrepel_0.9.1         cluster_2.1.0         fs_1.5.0             
##  [73] data.table_1.14.0     DO.db_2.9             triebeard_0.3.0      
##  [76] reprex_0.3.0          reactome.db_1.68.0    matrixStats_0.60.1   
##  [79] hms_1.1.0             mime_0.11             evaluate_0.14        
##  [82] xtable_1.8-4          XML_3.99-0.3          jpeg_0.1-9           
##  [85] readxl_1.3.1          compiler_3.6.3        crayon_1.4.1         
##  [88] htmltools_0.5.2       later_1.3.0           Formula_1.2-4        
##  [91] lubridate_1.7.10      DBI_1.1.1             tweenr_1.0.2         
##  [94] dbplyr_1.4.2          MASS_7.3-53           rappdirs_0.3.3       
##  [97] Matrix_1.2-18         cli_3.0.1             igraph_1.2.6         
## [100] pkgconfig_2.0.3       rvcheck_0.1.8         foreign_0.8-76       
## [103] xml2_1.3.2            foreach_1.5.1         bslib_0.3.0          
## [106] webshot_0.5.2         rvest_0.3.5           yulab.utils_0.0.2    
## [109] digest_0.6.27         graph_1.62.0          rmarkdown_2.7        
## [112] cellranger_1.1.0      fastmatch_1.1-3       htmlTable_1.13.3     
## [115] curl_4.3.2            shiny_1.6.0           graphite_1.30.0      
## [118] lifecycle_1.0.0       jsonlite_1.7.2        fansi_0.5.0          
## [121] pillar_1.6.2          lattice_0.20-41       fastmap_1.1.0        
## [124] httr_1.4.2            survival_3.2-7        GO.db_3.8.2          
## [127] UpSetR_1.4.0          png_0.1-7             iterators_1.0.13     
## [130] bit_4.0.4             ggforce_0.3.1         stringi_1.7.4        
## [133] sass_0.4.0            blob_1.2.2            latticeExtra_0.6-29  
## [136] memoise_2.0.0